In [11]:
# Core numerics / dataframes / plotting.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# NOTE(review): statsmodels.tsa.arima_model.ARIMA is deprecated and was removed
# in statsmodels 0.13 — modern environments need statsmodels.tsa.arima.model.ARIMA,
# whose .forecast() returns a Series rather than a (forecast, stderr, conf_int)
# tuple (the tuple form is relied on below). Confirm the pinned statsmodels version.
from statsmodels.tsa.arima_model import ARIMA
import statsmodels.api as sm
from statsmodels.tsa.seasonal import seasonal_decompose
import warnings 
# Suppress convergence/deprecation chatter from the ARIMA grid search below.
warnings.filterwarnings('ignore')
from sklearn.metrics import accuracy_score
import seaborn as sns
In [2]:
######################################## Import Data From Quandl API ########################################################## 

#import quandl
#import pandas as pd
#import xlsxwriter
##mydata = quandl.get_table('ZACKS/FC', ticker='AAPL')
#quandl.ApiConfig.api_key = "XXXXXXXXXXXXXX" ########## Register and generate a key ##############
#mydata = quandl.get("FRED/GDP")

#excelfile= r'dailyd2.xlsx' 
#workbook = xlsxwriter.Workbook(excelfile)
#worksheet1 = workbook.add_worksheet('daily')
#bold = workbook.add_format({'bold': True})

#row = 0 
#worksheet1.write(row,0,'Date')
#mydata = quandl.get('WIKI/AAPL', start_date="2017-01-01", end_date="2017-08-25")
#daterange  = mydata.index ############# Save the date range ###############
#for i in daterange:
#    row = row+1
#    worksheet1.write(row,0,i)    


#data = pd.read_csv(r'0825_quandl_ticks2.csv') ########### Load the list of tickers ############# 
#tlist = data['Ticker']
#col = 0
#for tl in tlist: 
#    try:
#        mydata = quandl.get('WIKI/'+tl, start_date="2017-01-01", end_date="2017-08-25")
#        if len(mydata)-1 == len(daterange):
#            row = 0
#            col = col+1
#            worksheet1.write(row,col,tl,bold)       
#            for dr in daterange:
#                row = row+1                   
#                for i,j in mydata['Adj. Close'].iteritems():
#                    if dr==i:
#                        worksheet1.write(row,col,j) 
#            print(tl,j)    
#    except Exception as e:
#        print(e)
#workbook.close()    
In [3]:
######## Load Data #########
# Daily prices, indexed by Date, one column per ticker (produced by the
# commented-out Quandl export above).
# NOTE(review): hardcoded absolute Windows path — consider a configurable
# DATA_DIR / relative path so the notebook runs on other machines.
data = pd.read_excel(r'C:\ModelData\dailyd2.xlsx', index_col='Date', parse_dates=True)
In [4]:
################ Calculate and plot volatility #####################

# Per-ticker dispersion statistics. normalized_std (std / mean, the coefficient
# of variation) makes volatility comparable across different price levels.
df_stat = pd.DataFrame(columns=['std', 'mean', 'normalized_std'])
df_stat[['std', 'mean', 'normalized_std']] = pd.DataFrame(
    [data.std(), data.mean(), data.std() / data.mean()]).T
# BUG FIX: sort_values returns a new frame; the original discarded the result,
# so the sort was a no-op. Assign it back so df_stat is actually ordered.
df_stat = df_stat.sort_values('normalized_std')
plt.title('Normalized Standard Deviation')
plt.hist(df_stat['normalized_std'])
fig = plt.gcf()
fig.set_size_inches(14.5, 5.5)
plt.show()
In [5]:
######### Based on the graph above filter out stocks with high variance and low returns#############
# Keep tickers with coefficient of variation < 0.15 ...
df_fltr = df_stat[df_stat['normalized_std'] < .15]
df = data[df_fltr.index]
# ... and a total-period return of at least 25%.
# FIX: use .iloc for positional first/last access — integer-fallback indexing
# (series[-1]) on a date-indexed Series is deprecated/removed in modern pandas.
tkl = [x for x in df if (df[x].iloc[-1] - df[x].iloc[0]) / df[x].iloc[0] >= .25]
df = df[tkl]

fltr_tk = []
for tk in df:
    # FIX: Series.as_matrix() was removed in pandas 0.25 — use .to_numpy().
    Y = df[tk].to_numpy()
    X = range(len(df.index))
    X = sm.add_constant(X)
    model = sm.OLS(Y, X)
    results = model.fit()
    if results.params[1] > .01: ########## Filter on slope coefficient from Regression Model #########
        fltr_tk.append(tk)
        df.plot(y=[tk])
        plt.title('Daily '+tk+' Stock Prices 2017-01-01 to 2017-07-31')
        plt.legend(loc='upper left')
        plt.axvspan('2017-08-01','2017-08-25', color='green', alpha=0.25)
        fig = plt.gcf()
        fig.set_size_inches(16.5, 4.5)
        plt.show()
In [6]:
################ PLot Trend and Seasonality #########################

# For each surviving ticker, decompose the series into trend + seasonal
# components and plot them side by side.
for tk in fltr_tk:        
        df_d = df[tk]
        # seasonal_decompose requires a regular frequency, but the trading-day
        # index has weekend/holiday gaps — temporarily relabel the series with a
        # synthetic contiguous daily ('D') range of the same length.
        # NOTE(review): len(df) == len(df_d) since df_d is one column of df, but
        # len(df_d) would state the intent more directly.
        adj_index = pd.date_range(df_d.index[0], periods=len(df), freq='D')
        df_d.index = adj_index
        decomposition = seasonal_decompose(df_d)
        trend = decomposition.trend
        seasonal = decomposition.seasonal
        # Restore the real trading-day labels on all three series so the x-axis
        # shows actual dates (order matters: decompose above used adj_index).
        seasonal.index = df.index 
        trend.index = df.index
        df_d.index = df.index
        plt.subplot(1,2,1)
        plt.plot(trend,label='Trend')
        plt.title(tk)
        plt.xticks(rotation=90)
        plt.legend(loc='best')
        plt.subplot(1,2,2)
        plt.plot(seasonal,label='Seasonality')
        plt.title(tk)
        plt.xticks(rotation=90)
        plt.legend(loc='best')
        fig = plt.gcf()
        fig.set_size_inches(16.5, 4.5)
        plt.show()
In [7]:
############### Run ARIMA Model ###################

# Grid-search ARIMA(p, d, q) per ticker, scoring each fit by MAPE on a 20-day
# holdout, and keep the best model's forecast in `store` for later evaluation.
store = {}
for tk in fltr_tk:
    train = df[tk][0:-20]
    test = df[tk][len(train):]
    # BUG FIX: the original initialized the best score to the magic sentinel 99,
    # so any ticker whose best legitimate MAPE was >= 99% never recorded a
    # forecast, leaving af = [] and crashing the plot below. Use +inf instead.
    ap = ad = aq = None
    amape = float('inf')
    af = []
    for p in range(10):
        for q in range(10):
            for d in range(2):
                try:
                    model = ARIMA(train, order=(p, d, q)).fit()
                    # forecast() returns (forecast, stderr, conf_int) in the
                    # legacy statsmodels ARIMA API.
                    predict = model.forecast(len(test))
                    fcst = predict[0]
                    # Mean absolute percentage error over the holdout window.
                    mapelist = [abs(test[i] - fcst[i]) / test[i] for i in range(len(fcst))]
                    mape = round(np.mean(mapelist) * 100, 2)
                except Exception:
                    # FIX: catch Exception rather than a bare except — many
                    # (p, d, q) combinations legitimately fail to estimate.
                    continue
                if mape < amape:
                    amape = mape
                    ap, ad, aq = p, d, q
                    af = fcst
    store[tk] = af
    plt.plot(train)
    plt.plot(test,label='Actual')
    plt.plot(test.index,af,label='Predicted')
    fig = plt.gcf()
    fig.set_size_inches(16.5, 4.5)
    plt.title(str(tk)+"_"+"MAPE"+"_"+str(amape)+"_"+"Order"+"_"+"("+str(ap)+str(ad)+str(aq)+")")
    plt.legend(loc='best')
    plt.show()
In [8]:
###### Correlation HeatMap #######
# Absolute pairwise Pearson correlations among the selected tickers — darker
# off-diagonal cells mean more co-movement, i.e. less diversification benefit.
selected = df[fltr_tk]
corr = selected.corr(method='pearson', min_periods=1).abs()
ax = sns.heatmap(corr)
ax.set_title('PortFolio Diversity HeatMap')
fig = plt.gcf()
fig.set_size_inches(18.5, 10.5)
plt.show()
In [9]:
##### Test model accuracy #####

# Direction-of-move evaluation per ticker: did the price rise (1) or not (0)
# relative to the last training day — actually (test) vs. as forecast (store).
eval_metrcs = pd.DataFrame(columns=['Actual', 'Predicted', 'Actual_Growth', 'Predicted_Growth'], index=fltr_tk)

for tk in fltr_tk:
    train = df[tk][0:-20]
    test = df[tk][len(train):]
    # FIX: positional last-element access via .iloc (integer-fallback indexing
    # on a date index is deprecated/removed in modern pandas).
    last_train = train.iloc[-1]
    last_actual = test.iloc[-1]
    last_pred = store[tk][-1]
    # FIX: the original four copy-pasted branches reduce to two comparisons
    # (ties count as "no rise" = 0, matching the original >= branches), and
    # .loc[row, col] assignment replaces the chained-indexing pattern that
    # triggers SettingWithCopyWarning.
    eval_metrcs.loc[tk, 'Actual'] = int(last_actual > last_train)
    eval_metrcs.loc[tk, 'Predicted'] = int(last_pred > last_train)
    eval_metrcs.loc[tk, 'Actual_Growth'] = (last_actual - last_train) / last_train
    eval_metrcs.loc[tk, 'Predicted_Growth'] = (last_pred - last_train) / last_train

eval_metrcs
Out[9]:
Actual Predicted Actual_Growth Predicted_Growth
HPQ 1 0 0.00520562 -0.017592
BGCP 1 1 0.0344432 0.0274154
BLDR 0 1 -0.0562743 0.0407847
CZR 0 0 -0.113725 -0.0310849
CAMP 0 0 -0.0433884 -0.0357415
CPN 1 1 0.0288124 0.0241479
CWST 0 0 -0.0268692 -0.0257216
CENX 1 0 0.00892359 -0.21757
COHU 0 0 -0.0932919 -0.131128
DAR 1 0 0.0654545 -0.0141167
FOE 1 1 0.0202128 0.00680254
GSM 1 1 0.0460829 0.0259386
GTN 0 0 -0.0562914 -0.100722
IXYS 0 0 -0.0859599 -0.0192517
MDCA 0 0 -0.00980392 -0.00131282
MTOR 1 1 0.0772311 0.0496162
MGI 0 0 -0.0295567 -0.0605303
MYE 1 1 0.0314286 0.031333
NYT 0 0 -0.085213 -0.023136
ORBC 0 0 -0.0545145 -0.0124334
PNK 1 1 0.00455005 0.0167042
RDNT 1 1 0.251613 0.0372675
SEM 1 1 0.0617284 0.0590196
TGH 1 0 0.0125786 -0.0381664
USAP 0 0 -0.0281081 -0.0258438
NLY 1 1 0.0403361 0.0491873
FOR 0 1 0 0.000157287
FORM 1 1 0.0792453 0.0899497
RSO 1 1 0.0264188 0.0305759
GAIA 0 0 -0.0214592 -0.0252127
In [10]:
model_accuracy = accuracy_score(eval_metrcs['Actual'].astype(int),eval_metrcs['Predicted'].astype(int))
model_accuracy
Out[10]:
0.80000000000000004